library(tidyr)
library(dplyr)
library(jsonlite)
library(purrr)
library(stringr)
library(lubridate)
library(modelr)
library(broom)
library(tibble)
library(ggplot2)
library(mgcv)

Apro il file JSON, per come è fatto devo accedere al primo elemento e posso quindi caricarlo in un tibble. Sono 14.536 liste.

complete <- read_json("complete.json")
programs <- complete[[1]]
programsT <- tibble(programs)
remove(complete)
remove(programs)

A questo punto ho la possibilità di accedere ai programmi, controllo che siano tutte liste di lunghezza 6.

programsT %>%
  mutate(programs_L = sapply(programs,length)) %>%
  filter(programs_L != 6)
## # A tibble: 0 × 2
## # … with 2 variables: programs <list>, programs_L <int>

Posso quindi espandere i programmi.

programsT <- programsT %>% unnest_wider(programs)
programsT %>% distinct(programID)
## # A tibble: 14,531 × 1
##    programID
##    <chr>    
##  1 3853     
##  2 5178     
##  3 10785    
##  4 5887     
##  5 305      
##  6 3368     
##  7 4226     
##  8 5087     
##  9 6310     
## 10 1979     
## # … with 14,521 more rows
programsT %>% select(programID) %>% n_distinct() # ci sono 14531 programmi
## [1] 14531

Nella maggior parte dei casi a un programma è associato un concerto, ma non è sempre così. In totale ci sono 22525 concerti.

programsT %>%
  mutate(concerts_L=sapply(concerts,length)) %>%
  count(concerts_L) %>%
  mutate(concertN = concerts_L * n) %>%
  mutate(concertsT = cumsum(concertN)) %>%
  arrange(-concertsT)
## # A tibble: 11 × 4
##    concerts_L     n concertN concertsT
##         <int> <int>    <int>     <int>
##  1         16     4       64     22525
##  2         12     4       48     22461
##  3          9     7       63     22413
##  4          8     2       16     22350
##  5          7     2       14     22334
##  6          6     5       30     22320
##  7          5    84      420     22290
##  8          4  1062     4248     21870
##  9          3  1033     3099     17622
## 10          2  2190     4380     14523
## 11          1 10143    10143     10143

Espando i concerti, ottengo una riga per ogni concerto. Passo da 14.536 a 22.525 righe. Aggiungo anche una colonna concertID. Passo a 7 colonne.

programsT <- programsT %>% unnest_longer(concerts)
nrow(programsT)
## [1] 22525
programsT <- rowid_to_column(programsT,var = "concertID")
programsT <- select(programsT,id,programID,orchestra,season,concertID,concerts,works)

Verifico che ogni concerto abbia 5 campi.

programsT %>%
  mutate(concerts_L = sapply(concerts, length)) %>%
  filter(concerts_L != 5)
## # A tibble: 0 × 8
## # … with 8 variables: id <chr>, programID <chr>, orchestra <chr>, season <chr>,
## #   concertID <int>, concerts <list>, works <list>, concerts_L <int>

Posso espandere con unnest_wider. Allargo il tibble di 4 colonne, al posto di concerts ho eventType, Location, Venue, Date e Time.

programsT <- programsT %>% unnest_wider(concerts)

Come si può vedere, un programma può contenere più concerti.

programsT %>% group_by(programID) %>% count(sort = TRUE)
## # A tibble: 14,531 × 2
## # Groups:   programID [14,531]
##    programID     n
##    <chr>     <int>
##  1 10700        16
##  2 10702        16
##  3 3128         16
##  4 3139         16
##  5 10701        12
##  6 10703        12
##  7 3134         12
##  8 3144         12
##  9 14385         9
## 10 14403         9
## # … with 14,521 more rows

Per esempio il programma 10700 contiene 16 concerti. In questo caso vediamo anche che di alcuni concerti non è indicata l’ora. Per questo motivo ho introdotto un concertID, se in seguito mi fossi limitato a identificare il concerto con eventType, Location, Venue, Date, Time, avrei perso alcuni concerti.

programsT %>%
  filter(programID == 10700)
## # A tibble: 16 × 11
##    id    progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date  Time  works 
##    <chr> <chr>   <chr>   <chr>    <int> <chr>   <chr>   <chr> <chr> <chr> <list>
##  1 dbe4… 10700   New Yo… 1950-…    8525 Special Manhat… Roxy… 1951… 12:4… <list>
##  2 dbe4… 10700   New Yo… 1950-…    8526 Special Manhat… Roxy… 1951… None  <list>
##  3 dbe4… 10700   New Yo… 1950-…    8527 Special Manhat… Roxy… 1951… None  <list>
##  4 dbe4… 10700   New Yo… 1950-…    8528 Special Manhat… Roxy… 1951… None  <list>
##  5 dbe4… 10700   New Yo… 1950-…    8529 Special Manhat… Roxy… 1951… 12:4… <list>
##  6 dbe4… 10700   New Yo… 1950-…    8530 Special Manhat… Roxy… 1951… None  <list>
##  7 dbe4… 10700   New Yo… 1950-…    8531 Special Manhat… Roxy… 1951… None  <list>
##  8 dbe4… 10700   New Yo… 1950-…    8532 Special Manhat… Roxy… 1951… None  <list>
##  9 dbe4… 10700   New Yo… 1950-…    8533 Special Manhat… Roxy… 1951… 12:4… <list>
## 10 dbe4… 10700   New Yo… 1950-…    8534 Special Manhat… Roxy… 1951… None  <list>
## 11 dbe4… 10700   New Yo… 1950-…    8535 Special Manhat… Roxy… 1951… None  <list>
## 12 dbe4… 10700   New Yo… 1950-…    8536 Special Manhat… Roxy… 1951… None  <list>
## 13 dbe4… 10700   New Yo… 1950-…    8537 Special Manhat… Roxy… 1951… None  <list>
## 14 dbe4… 10700   New Yo… 1950-…    8538 Special Manhat… Roxy… 1951… None  <list>
## 15 dbe4… 10700   New Yo… 1950-…    8539 Special Manhat… Roxy… 1951… None  <list>
## 16 dbe4… 10700   New Yo… 1950-…    8540 Special Manhat… Roxy… 1951… 12:4… <list>
## # … with abbreviated variable names ¹​programID, ²​orchestra, ³​concertID,
## #   ⁴​eventType, ⁵​Location

Ogni programma contiene un certo numero di lavori (works). In totale ci sono 125.224 works (si tratta di works che possono essere ripetuti in diversi programs).

programsT %>%
  mutate(works_L=sapply(works,length)) %>%
  count(works_L) %>%
  mutate(works_N = works_L * n) %>%
  mutate(works_T = cumsum(works_N)) %>%
  arrange(-works_T)
## # A tibble: 37 × 4
##    works_L     n works_N works_T
##      <int> <int>   <int>   <int>
##  1      50    13     650  125224
##  2      48     4     192  124574
##  3      46     5     230  124382
##  4      41     1      41  124152
##  5      36     1      36  124111
##  6      32     4     128  124075
##  7      30     7     210  123947
##  8      29     2      58  123737
##  9      28     6     168  123679
## 10      27     7     189  123511
## # … with 27 more rows

Ci sono dei program che hanno 0 works, che cosa sono? Si tratta di eventi che non prevedono musica o dei quali mancano informazioni.

programsT %>%
  mutate(works_L=sapply(works,length)) %>%
  filter(works_L == 0)
## # A tibble: 29 × 12
##    id    progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date  Time  works 
##    <chr> <chr>   <chr>   <chr>    <int> <chr>   <chr>   <chr> <chr> <chr> <list>
##  1 2a85… 13816   New Yo… 1894-…     577 Subscr… Manhat… Metr… 1895… 8:00… <NULL>
##  2 21fa… 12310   Stadiu… 1934-…    5697 Stadiu… Manhat… Grea… 1935… 8:30… <NULL>
##  3 e76c… 10203   Member… 1943-…    7229 Studen… Manhat… Hunt… 1944… 3:30… <NULL>
##  4 1b6c… 8607    NYP Pr… 1998-…   17636 Chamber Manhat… Alic… 1998… 8:00… <NULL>
##  5 e91b… 13779   Musici… 2007-…   19411 Musici… Manhat… Hull… 2007… 6:30… <NULL>
##  6 7112… 13665   Musici… 2012-…   20558 Chambe… Manhat… Doub… 2012… 8:30… <NULL>
##  7 3b66… 13666   NYP Pr… 2012-…   20700 Chambe… Manhat… St. … 2013… 7:00… <NULL>
##  8 26fe… 13667   Musici… 2012-…   20704 Chambe… Manhat… Racq… 2013… None  <NULL>
##  9 98f2… 13749   NYP Pr… 2013-…   20745 Chamber Manhat… The … 2013… 3:00… <NULL>
## 10 3dd4… 13341   NYP Pr… 2013-…   20767 Chambe… Manhat… None  2013… None  <NULL>
## # … with 19 more rows, 1 more variable: works_L <int>, and abbreviated variable
## #   names ¹​programID, ²​orchestra, ³​concertID, ⁴​eventType, ⁵​Location

Posso utilizzare unnest_longer, ogni lavoro associato a un programma darà origine a una riga. Devo usare l’opzione keep_empty TRUE per non perdere le righe con zero works. Quindi il tibble passerà a 125.224 + 29 righe, ovvero 125.253.

programsT <- programsT %>% unnest_longer(works, keep_empty = TRUE)
programsT %>% filter(programID == 13665)
## # A tibble: 1 × 11
##   id     progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date  Time  works 
##   <chr>  <chr>   <chr>   <chr>    <int> <chr>   <chr>   <chr> <chr> <chr> <list>
## 1 71124… 13665   Musici… 2012-…   20558 Chambe… Manhat… Doub… 2012… 8:30… <NULL>
## # … with abbreviated variable names ¹​programID, ²​orchestra, ³​concertID,
## #   ⁴​eventType, ⁵​Location

Ora works contiene liste di lunghezze diverse, a seconda delle caratteristiche del work compreso.

programsT %>%
  mutate(works_L=sapply(works,length)) %>%
  count(works_L)
## # A tibble: 5 × 2
##   works_L     n
##     <int> <int>
## 1       0    29
## 2       3 18489
## 3       4  5292
## 4       5 68041
## 5       6 33402

Le liste di lunghezza 3 sono quelle che contengono le intermissions.

programsT %>%
  mutate(works_L = sapply(works,length)) %>%
  filter(works_L == 3) %>%
  select(works) %>%
  unnest_wider(works) %>%
  count(ID, interval, soloists)
## # A tibble: 4 × 4
##   ID    interval            soloists     n
##   <chr> <chr>               <lgl>    <int>
## 1 0*    Intermission        NA       18394
## 2 1743* Intermission-Third  NA           1
## 3 4346* Intermission-Short  NA          54
## 4 7955* Intermission-Second NA          40

Cerco di capire cosa contengono le altre liste (di lunghezza 4, 5 e 6).

programsT %>%
  mutate(works_L = sapply(works,length)) %>%
  filter(works_L == 4) %>%
  select(works) %>%
  unnest_wider(works)
## # A tibble: 5,292 × 4
##    ID     composerName              workTitle soloists  
##    <chr>  <chr>                     <list>    <list>    
##  1 3642*  Hummel,  Johann           <chr [1]> <list [5]>
##  2 52425* Beethoven,  Ludwig  van   <chr [1]> <list [7]>
##  3 3712*  Bochsa,  Robert  N. C.    <chr [1]> <list [2]>
##  4 3764*  Lindpaintner,  Peter  Von <chr [1]> <list [5]>
##  5 3712*  Bochsa,  Robert  N. C.    <chr [1]> <list [2]>
##  6 3764*  Lindpaintner,  Peter  Von <chr [1]> <list [5]>
##  7 3894*  Traditional,              <chr [1]> <list [2]>
##  8 3468*  Haydn,  Franz  Joseph     <chr [1]> <list [1]>
##  9 3991*  Donizetti,  Gaetano       <chr [1]> <list [2]>
## 10 3999*  Hummel,  Johann           <chr [1]> <list [7]>
## # … with 5,282 more rows
programsT %>%
  mutate(works_L = sapply(works,length)) %>%
  filter(works_L == 5) %>%
  select(works) %>%
  unnest_wider(works)
## # A tibble: 68,041 × 6
##    ID     composerName              workTitle conductorName      soloi…¹ movem…²
##    <chr>  <chr>                     <list>    <chr>              <list>  <list> 
##  1 52446* Beethoven,  Ludwig  van   <chr [1]> Hill, Ureli Corel… <NULL>  <NULL> 
##  2 5543*  Kalliwoda,  Johann  W.    <chr [1]> Timm, Henry C.     <NULL>  <NULL> 
##  3 52437* Beethoven,  Ludwig  van   <chr [1]> Hill, Ureli Corel… <NULL>  <NULL> 
##  4 3659*  Romberg,  Bernhard        <chr [1]> Hill, Ureli Corel… <list>  <NULL> 
##  5 4567*  Hummel,  Johann           <chr [1]> Hill, Ureli Corel… <list>  <NULL> 
##  6 5150*  Pacini,  Giovanni         <chr [1]> Not conducted      <list>  <NULL> 
##  7 5161*  Romberg,  Bernhard        <chr [1]> Not conducted      <list>  <NULL> 
##  8 5166*  Thalberg,  Sigismond      <chr [1]> Not conducted      <list>  <NULL> 
##  9 5172*  Herz,  Henri              <chr [1]> Alpers, William    <list>  <NULL> 
## 10 5174*  Lindpaintner,  Peter  Von <chr [1]> Not conducted      <list>  <NULL> 
## # … with 68,031 more rows, and abbreviated variable names ¹​soloists, ²​movement
programsT %>%
  mutate(works_L = sapply(works,length)) %>%
  filter(works_L == 6) %>%
  select(works) %>%
  unnest_wider(works)
## # A tibble: 33,402 × 6
##    ID      composerName               workTitle          movem…¹ condu…² soloi…³
##    <chr>   <chr>                      <chr>              <list>  <chr>   <list> 
##  1 8834*4  Weber,  Carl  Maria Von    OBERON             <chr>   Timm, … <list> 
##  2 8834*3  Weber,  Carl  Maria Von    OBERON             <chr>   Etienn… <NULL> 
##  3 8835*1  Rossini,  Gioachino        ARMIDA             <chr>   Timm, … <list> 
##  4 8837*6  Beethoven,  Ludwig  van    FIDELIO, OP. 72    <chr>   Timm, … <list> 
##  5 8336*4  Mozart,  Wolfgang  Amadeus ABDUCTION FROM TH… <chr>   Timm, … <list> 
##  6 8838*2  Bellini,  Vincenzo         I PURITANI         <chr>   Hill, … <list> 
##  7 8839*2  Rossini,  Gioachino        WILLIAM TELL       <chr>   Alpers… <NULL> 
##  8 53076*2 Rossini,  Gioachino        STABAT MATER       <chr>   Alpers… <list> 
##  9 51568*2 Hummel,  Johann            CONCERTO, PIANO, … <chr>   Alpers… <list> 
## 10 51568*3 Hummel,  Johann            CONCERTO, PIANO, … <chr>   Alpers… <list> 
## # … with 33,392 more rows, and abbreviated variable names ¹​movement,
## #   ²​conductorName, ³​soloists

Faccio unnest_wider di works. Passo da 11 colonne a 17 (al posto di works si aggiungono tutte le colonne comuni ai diversi works, ovvero ID, composerName, workTitle, movement, conductorName, soloists, interval)

programsT <- programsT %>% unnest_wider(works)

Passo a prendere in considerazione workTitle.

programsT %>%
  mutate(workTitle_L = sapply(workTitle,length)) %>%
  count(workTitle_L)
## # A tibble: 3 × 2
##   workTitle_L      n
##         <int>  <int>
## 1           0  18518
## 2           1 106723
## 3           2     12

Come sono gli elementi di lunghezza 2.

programsT %>%
  mutate(workTitle_L = sapply(workTitle, length)) %>%
  filter(workTitle_L == 2) %>%
  pull(workTitle) %>%
  head(3)
## [[1]]
## [[1]]$`_`
## [1] ", PROCESSION OF THE KNIGHTS OF THE HOLY GRAIL"
## 
## [[1]]$em
## [1] "PARSIFAL"
## 
## 
## [[2]]
## [[2]]$`_`
## [1] "CHORUS OF VILLAGERS FROM  (ARR. CLARINET ENS.) (ARR. Bellison)"
## 
## [[2]]$em
## [1] "PRINCE IGOR"
## 
## 
## [[3]]
## [[3]]$`_`
## [1] "AIR FROM  (ARR. CLARINET ENS.) (ARR. Bellison)"
## 
## [[3]]$em
## [1] "SUITE IN D MAJOR"

Mi salvo i program ID per ritrovarli dopo la modifica.

wt2ids <- programsT %>%
  mutate(workTitle_L = sapply(workTitle, length)) %>%
  filter(workTitle_L == 2) %>%
  select(ID)

Applico funzione per sistemarli (sistemo anche NA al posto di NULL), faccio semi_join con gli id per verificare il risultato su quelli di lunghezza 2.

mod_w <- function(x) {
  if (is.null(x)) return (NA)
  if (length(x) == 1) return (x[[1]])
  if (length(x) == 2) return (combine_w(x[[1]], x[[2]]))
}

combine_w <- function(x,y) {
  if (startsWith(x,",")) {
    return (paste(y,x))
  } else {
    return (paste(x,y))
  }
}

programsT %>%
  mutate(workTitle = sapply(workTitle, mod_w)) %>%
  semi_join(wt2ids) %>%
  select(workTitle)
## Joining with `by = join_by(ID)`
## # A tibble: 12 × 1
##    workTitle                                                                 
##    <chr>                                                                     
##  1 PARSIFAL , PROCESSION OF THE KNIGHTS OF THE HOLY GRAIL                    
##  2 CHORUS OF VILLAGERS FROM  (ARR. CLARINET ENS.) (ARR. Bellison) PRINCE IGOR
##  3 AIR FROM  (ARR. CLARINET ENS.) (ARR. Bellison) SUITE IN D MAJOR           
##  4 ANDANTE FROM  (ARR. CLARINET ENS.) (ARR. Bellison) SURPRISE SYMPHONY      
##  5 OVERTURE, DI BALLO                                                        
##  6 OVERTURE, DI BALLO                                                        
##  7 OVERTURE, DI BALLO                                                        
##  8 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS                 
##  9 CONCERT FANTASY ON  FOR VIOLIN AND PIANO (ARR. Sarasate) CARMEN           
## 10 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS                 
## 11 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS                 
## 12 QUARTET, STRING, OP. 76, NO. 2, D MINOR, H.III:76, FIFTHS

Eseguo l’operazione e ri-assegno a programsT.

programsT <- programsT %>%
  mutate(workTitle = sapply(workTitle, mod_w))

Verifico la lunghezza degli elementi contenuti in movement.

programsT %>%
  mutate(movement_L = sapply(movement, length)) %>%
  count(movement_L)
## # A tibble: 3 × 2
##   movement_L     n
##        <int> <int>
## 1          0 89759
## 2          1 35346
## 3          2   148

Vedo come sono quelli di lunghezza 2.

programsT %>%
  mutate(movement_L = sapply(movement, length)) %>%
  filter(movement_L == 2)
## # A tibble: 148 × 18
##    id     progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date  Time  ID   
##    <chr>  <chr>   <chr>   <chr>    <int> <chr>   <chr>   <chr> <chr> <chr> <chr>
##  1 6a02e… 1907    New Yo… 1849-…      33 Subscr… Manhat… Apol… 1850… 8:00… 5801…
##  2 6a02e… 1907    New Yo… 1849-…      33 Subscr… Manhat… Apol… 1850… 8:00… 5801…
##  3 6f3bc… 11474   Musici… 1876-…     172 Special Manhat… Stei… 1876… 8:00… 6529…
##  4 d3eff… 871     New Yo… 1879-…     228 Subscr… Manhat… Acad… 1880… 8:00… 5801…
##  5 d3eff… 871     New Yo… 1879-…     228 Subscr… Manhat… Acad… 1880… 8:00… 5801…
##  6 d4145… 9023    New Yo… 1898-…     695 Young … Manhat… Carn… 1899… 2:30… 6529…
##  7 09a41… 10541   New Yo… 1903-…     908 Young … Manhat… Carn… 1904… None  6529…
##  8 59816… 10512   New Yo… 1905-…     987 Subscr… Manhat… Carn… 1905… 2:30… 1063…
##  9 604c7… 10440   New Yo… 1913-…    1688 Young … Manhat… Carn… 1914… None  1050…
## 10 604c7… 10440   New Yo… 1913-…    1688 Young … Manhat… Carn… 1914… None  1050…
## # … with 138 more rows, 7 more variables: composerName <chr>, workTitle <chr>,
## #   conductorName <chr>, soloists <list>, movement <list>, interval <chr>,
## #   movement_L <int>, and abbreviated variable names ¹​programID, ²​orchestra,
## #   ³​concertID, ⁴​eventType, ⁵​Location

Mi limito a concatenarli, con la funzione mod_m. Inoltre sostituisco i NULL con NA.

mod_m <- function(x) {
  if (is.null(x)) return (NA)
  if (length(x) == 1) return (x[[1]])
  if (length(x) == 2) return (paste(x[[1]], "_", x[[2]]))
}

programsT <- programsT %>% mutate(movement = sapply(movement, mod_m))

Controllo la lunghezza delle liste contenute in soloists. Ho 89.321 potenziali righe di informazioni su solisti + 85.122 righe nulle.

programsT %>%
  mutate(soloists_L = sapply(soloists, length)) %>%
  count(soloists_L) %>%
  mutate(soloists_N = soloists_L * n) %>%
  mutate(soloists_T = cumsum(soloists_N)) %>%
  filter(soloists_L == 0 | soloists_L == max(soloists_L))
## # A tibble: 2 × 4
##   soloists_L     n soloists_N soloists_T
##        <int> <int>      <int>      <int>
## 1          0 85122          0          0
## 2         84     1         84      89321

Provo unnest_longer di soloists con opzione keep_empty per tenere anche le righe che contengono liste NULL e vedo che sono di lunghezza 0 o 3.

programsT %>%
  unnest_longer(soloists, keep_empty = TRUE) %>%
  mutate(soloists_L = sapply(soloists, length)) %>%
  count(soloists_L)
## # A tibble: 2 × 2
##   soloists_L     n
##        <int> <int>
## 1          0 85127
## 2          3 89316

Ce ne sono cinque che sono di lunghezza 1, ma poi NULL.

programsT %>%
  mutate(soloists_L = sapply(soloists, length)) %>%
  filter(soloists_L != 0) %>%
  unnest_longer(soloists, keep_empty = TRUE) %>%
  mutate(soloists_L2 = sapply(soloists, length)) %>%
  filter(soloists_L2 == 0)
## # A tibble: 5 × 19
##   id      progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date  Time  ID   
##   <chr>   <chr>   <chr>   <chr>    <int> <chr>   <chr>   <chr> <chr> <chr> <chr>
## 1 0afec4… 14489   Musici… 2019-…   22157 Virtua… Online… YouT… 2020… 7:40… 6979*
## 2 b64d6e… 14552   Musici… 2019-…   22165 Virtua… Online… YouT… 2020… 1:00… 5276…
## 3 23ee98… 14586   Musici… 2019-…   22171 Virtua… Online… Inst… 2020… 1:00… 1319…
## 4 70f296… 14477   Musici… 2019-…   22178 Virtua… Online… YouT… 2020… 1:20… 1320…
## 5 398d6d… 14470   Musici… 2019-…   22181 Virtua… Online… YouT… 2020… 10:4… 5007…
## # … with 8 more variables: composerName <chr>, workTitle <chr>,
## #   conductorName <chr>, soloists <list>, movement <chr>, interval <chr>,
## #   soloists_L <int>, soloists_L2 <int>, and abbreviated variable names
## #   ¹​programID, ²​orchestra, ³​concertID, ⁴​eventType, ⁵​Location

Per esempio.

programsT %>%
  filter(programID == 14489) %>%
  pull(soloists)
## [[1]]
## [[1]][[1]]
## NULL

Faccio quindi unnest_longer, poi unnest_wider, il numero di colonne passa da 17 a 19 (soloistName, soloistInstrument e soloistRoles al posto di soloists).

programsT <- programsT %>%
  unnest_longer(soloists, keep_empty = TRUE) %>%
  unnest_wider(soloists)
programsT
## # A tibble: 174,443 × 19
##    id     progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date  Time  ID   
##    <chr>  <chr>   <chr>   <chr>    <int> <chr>   <chr>   <chr> <chr> <chr> <chr>
##  1 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 5244…
##  2 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 8834…
##  3 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
##  4 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
##  5 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
##  6 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
##  7 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 3642*
##  8 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 0*   
##  9 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 8834…
## 10 00646… 3853    New Yo… 1842-…       1 Subscr… Manhat… Apol… 1842… 8:00… 8835…
## # … with 174,433 more rows, 8 more variables: composerName <chr>,
## #   workTitle <chr>, conductorName <chr>, soloistName <chr>,
## #   soloistInstrument <chr>, soloistRoles <chr>, movement <chr>,
## #   interval <chr>, and abbreviated variable names ¹​programID, ²​orchestra,
## #   ³​concertID, ⁴​eventType, ⁵​Location

id rappresenta il GUID, che fa parte dell’indirizzo a cui è possibile vedere il programma online (archives.nyphil.org/index.php/artifact/GUID/fullview). ci sono tre programID che sono associati a più GUID. Tengo comunque la colonna id.

ids <- programsT %>%
  distinct(id, programID) %>%
  group_by(programID) %>%
  count(sort = TRUE) %>%
  filter(n>1) %>%
  select(programID) %>%
  ungroup()

semi_join(programsT, ids) %>%
  distinct(id, programID) %>%
  select(id, programID)
## Joining with `by = join_by(programID)`
## # A tibble: 8 × 2
##   id                                       programID
##   <chr>                                    <chr>    
## 1 af869073-9643-4dab-b182-948e3f2e6ab9-0.1 8950     
## 2 f4cf3522-7910-4e90-9978-ab70ac615ae4-0.1 8950     
## 3 b31c88f2-1774-4f8b-94cf-835952c65175-0.1 8950     
## 4 38283dce-9333-46c4-828a-54c0ad957c7b-0.1 10525    
## 5 ee1431f7-6f25-4e5f-b815-4be9d743cf03-0.1 10525    
## 6 8a73c52f-95a8-411c-b6e8-3d7bc4d7b90a-0.1 10525    
## 7 f4fd303d-46c7-4233-b087-0d2f5f91cc7b-0.1 5358     
## 8 b533775d-c639-4f79-9838-f88a88f79e95-0.1 5358

Correggo come è scritta la data, aggiungo anche componenti anno, mese, giorno della data.

#ci sono 5591 righe che contengono None invece dell'orario in formato AM o PM
programsT %>%
  mutate(trovato = str_detect(programsT$Time, "\\d{1,2}:\\d{1,2}[P,A]M")) %>%
  filter(trovato == FALSE)
## # A tibble: 5,591 × 20
##    id     progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date  Time  ID   
##    <chr>  <chr>   <chr>   <chr>    <int> <chr>   <chr>   <chr> <chr> <chr> <chr>
##  1 09581… 305     New Yo… 1843-…       5 Subscr… Manhat… Apol… 1843… None  5245…
##  2 09581… 305     New Yo… 1843-…       5 Subscr… Manhat… Apol… 1843… None  3677…
##  3 09581… 305     New Yo… 1843-…       5 Subscr… Manhat… Apol… 1843… None  0*   
##  4 09581… 305     New Yo… 1843-…       5 Subscr… Manhat… Apol… 1843… None  8955…
##  5 09581… 305     New Yo… 1843-…       5 Subscr… Manhat… Apol… 1843… None  5190…
##  6 09581… 305     New Yo… 1843-…       5 Subscr… Manhat… Apol… 1843… None  5072…
##  7 8025e… 4226    New Yo… 1843-…       7 Subscr… Manhat… Apol… 1844… None  3707*
##  8 8025e… 4226    New Yo… 1843-…       7 Subscr… Manhat… Apol… 1844… None  3712*
##  9 8025e… 4226    New Yo… 1843-…       7 Subscr… Manhat… Apol… 1844… None  3712*
## 10 8025e… 4226    New Yo… 1843-…       7 Subscr… Manhat… Apol… 1844… None  0*   
## # … with 5,581 more rows, 9 more variables: composerName <chr>,
## #   workTitle <chr>, conductorName <chr>, soloistName <chr>,
## #   soloistInstrument <chr>, soloistRoles <chr>, movement <chr>,
## #   interval <chr>, trovato <lgl>, and abbreviated variable names ¹​programID,
## #   ²​orchestra, ³​concertID, ⁴​eventType, ⁵​Location
# lascio Time com'è, sistemo la data, tenendo anche anno, mese, giorno
programsT <- programsT %>%
  mutate(Date = str_extract(Date, "[[:digit:]-]+")) %>%
  mutate(Date = ymd(Date)) %>%
  mutate(Year = year(Date)) %>%
  mutate(Month = month(Date)) %>%
  mutate(Day = day(Date)) %>%
  select(id, programID, orchestra, season, concertID, eventType, Location, Venue, Date, Year, Month, Day, Time, ID, composerName, workTitle, conductorName, soloistName, soloistInstrument, soloistRoles, movement, interval)

Divido ID in workID e movementID

programsT <- programsT %>%
  separate(ID, into = c("workID", "movID"), sep = "\\*") %>%
  mutate(workID = as.integer(workID)) %>%
  mutate(movID = as.integer(movID))

Ci sono delle celle con valori multipli in conductorName.

programsT %>%
  mutate(co_conductor = str_detect(conductorName, ";")) %>%
  filter(co_conductor)
## # A tibble: 502 × 24
##    id      progr…¹ orche…² season conce…³ event…⁴ Locat…⁵ Venue Date        Year
##    <chr>   <chr>   <chr>   <chr>    <int> <chr>   <chr>   <chr> <date>     <dbl>
##  1 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
##  2 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
##  3 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
##  4 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
##  5 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
##  6 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
##  7 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
##  8 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
##  9 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
## 10 1b7479… 10581   New Yo… 1847-…      24 Special Manhat… Cast… 1848-02-05  1848
## # … with 492 more rows, 14 more variables: Month <dbl>, Day <int>, Time <chr>,
## #   workID <int>, movID <int>, composerName <chr>, workTitle <chr>,
## #   conductorName <chr>, soloistName <chr>, soloistInstrument <chr>,
## #   soloistRoles <chr>, movement <chr>, interval <chr>, co_conductor <lgl>, and
## #   abbreviated variable names ¹​programID, ²​orchestra, ³​concertID, ⁴​eventType,
## #   ⁵​Location

Correggo un errore (un conductorName scritto come “de Waart, Edo; ; ; de Waart, Edo”).

programsT <- programsT %>%
  mutate(conductorName = if_else(str_detect(conductorName, ";\\s*;\\s*;"), "de Waart, Edo", conductorName))

Correggo errori nei nomi dei compositori.

programsT <- programsT %>%
  mutate(composerName = str_replace_all(composerName, "\\s+", " "))

Modifico aggiungendo una colonna che indica la presenza di co-conduzione.

programsT <- programsT %>%
  mutate(conductorName = str_remove(conductorName, "^; ")) %>%
  mutate(co_conductor = str_detect(conductorName, ";")) %>%
  separate_longer_delim(conductorName, "; ")

Costruisco un tibble ridotto per le analisi successive.

performances <- programsT %>%
  filter(is.na(interval)) %>%
  distinct(concertID, Date, composerName, workTitle, conductorName, co_conductor, orchestra, eventType, Location, season) %>%
  mutate(seasonYear = sapply(season,function (x) as.integer(str_extract(x, "[[:digit:]]+")))) %>%
  ungroup()

remove(programsT)

Orchestre presenti nel database.

ggplot(performances) +
  geom_bar(aes(y = orchestra))

Orchestre nel tempo.

ggplot(performances) +
  geom_point(aes(seasonYear, orchestra))

Andamento numero performance nelle stagioni.

performances %>%
  group_by(seasonYear) %>%
  count() %>%
  ggplot(mapping = aes(x = seasonYear, y = n)) + 
  geom_point()

Che tipi di performance ci sono. Mi limito alle categorie che contengono più di 100 performance.

performances %>%
  group_by(eventType) %>%
  count() %>%
  filter(n > 100) %>%
  ggplot(aes(x = n, y = eventType)) +
  geom_bar(stat = "identity")

Tipologie eventi New York Philharmonic.

performances %>%
  filter(orchestra == "New York Philharmonic") %>%
  mutate(group = if_else(eventType == "Subscription Season", "SUB", "REST")) %>%
  group_by(seasonYear,group) %>%
  count() %>%
  ggplot(aes(x = seasonYear, y = n, colour = group)) +
  geom_point()

Quali sono gli eventType preponderanti dalla stagione 2000-2001 in poi non di tipo Subscription Season?

performances %>%
  filter(orchestra == "New York Philharmonic") %>%
  filter(seasonYear >= 2000) %>%
  filter(eventType != "Subscription Season") %>%
  group_by(seasonYear,eventType) %>%
  count() %>%
  ggplot(aes(x = seasonYear, y = n, color = eventType)) +
  geom_line(alpha = 1/2, show.legend = FALSE)

Fra gli eventi che non fanno parte della stagione regolare, quelli presenti con maggior continuità sono tour e concerti non-subscription.

performances %>%
  filter(orchestra == "New York Philharmonic") %>%
  filter(seasonYear >= 2000) %>%
  filter(eventType != "Subscription Season") %>%
  ggplot(aes(y=eventType)) +
  geom_bar()

performances %>%
  filter(orchestra == "New York Philharmonic") %>%
  filter(seasonYear >= 2000) %>%
  filter(eventType != "Subscription Season") %>%
  group_by(seasonYear,eventType) %>%
  count() %>%
  group_by(seasonYear) %>%
  mutate(seasonEvents = sum(n)) %>%
  ungroup() %>%
  mutate(perc = n / seasonEvents) %>%
  group_by(seasonYear) %>%
  filter(perc > 0.2) %>%
  ungroup() %>%
  ggplot() +
  geom_point(aes(seasonYear, perc, color = eventType)) +
  geom_line(aes(seasonYear, perc, color = eventType))

performances %>%
  filter(orchestra == "New York Philharmonic") %>%
  filter(seasonYear >= 2000) %>%
  filter(eventType != "Subscription Season") %>%
  group_by(seasonYear,eventType) %>%
  count() %>%
  group_by(seasonYear) %>%
  mutate(seasonEvents = sum(n)) %>%
  ungroup() %>%
  mutate(perc = n / seasonEvents) %>%
  group_by(seasonYear) %>%
  filter(eventType == "Bandwagon")
## # A tibble: 1 × 5
## # Groups:   seasonYear [1]
##   seasonYear eventType     n seasonEvents  perc
##        <int> <chr>     <int>        <int> <dbl>
## 1       2020 Bandwagon    47          131 0.359

Da ora in poi mi limito a considerare New York Philharmonic. 69759 performance.

performances <- performances %>%
filter(orchestra == "New York Philharmonic")

Si possono trovare dei trend che ci permettano di dire qualcosa sulla popolarità di alcuni compositori nel corso del tempo? Qui sotto un grafico con stagioni - numero di performance per compositore.

pcp <- performances %>%
  group_by(seasonYear, composerName) %>%
  count() %>%
  group_by(seasonYear) %>%
  mutate(totalSeason = sum(n)) %>%
  ungroup() %>%
  mutate(perc = n / totalSeason)

saveRDS(pcp, file = "pcp.RDS")

ggplot(pcp,aes(seasonYear, n, color = composerName)) +
  geom_line(alpha = 1/4, show.legend = FALSE)

Questo con le percentuali. Per ogni compositore, la percentuale di performance del compositore rispetto al totale della stagione.

ggplot(pcp,aes(seasonYear, perc, color = composerName)) +
  geom_line(alpha = 1/4, show.legend = FALSE)

Costruisco un modello quadratico per tutti i compositori (dipendente percentuale, indipendenti stagione e stagione al quadrato).

pcp <- pcp %>%
  mutate(seasonYear_2 = seasonYear**2)

pcp_nested <- pcp %>%
  group_by(composerName) %>%
  nest()

pcp_model <- function(df) {
  lm(perc ~ seasonYear + seasonYear_2, data = df)
}

pcp_nested <- pcp_nested %>%
  mutate(model = map(data, pcp_model))

pcp_nested <- pcp_nested %>%
  mutate(data = map2(data, model, add_residuals))
## Warning: There were 1354 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `data = map2(data, model, add_residuals)`.
## ℹ In group 1: `composerName = "ACT,"`.
## Caused by warning in `predict.lm()`:
## ! prediction from a rank-deficient fit may be misleading
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 1353 remaining warnings.
pcp_nested <- pcp_nested %>% 
  mutate(glance = map(model, broom::glance))

pcp_models_simp <- pcp_nested %>%
  mutate(model = map(model, tidy)) %>%
  unnest_longer(model) %>%
  unnest_wider(model) %>%
  select(composerName,term,estimate) %>%
  pivot_wider(names_from = term,values_from = estimate)

saveRDS(pcp_models_simp, file = "pcp_models_simp.RDS")

pcp_glance <- unnest(pcp_nested, glance, names_sep = "_")

saveRDS(pcp_glance, file = "pcp_glance.RDS")

pcp_resids <- unnest(pcp_nested, data)

saveRDS(pcp_resids, file = "pcp_resids.RDS")

Compositori meno di 26 stagioni.

atleast26 <- performances %>%
  group_by(composerName) %>%
  summarise(seasons = n_distinct(seasonYear)) %>%
  filter(seasons <= 25)

Modelli in cui tutte e due le componenti sono positive. Non ci sono.

up_up <- pcp_models_simp %>%
  filter(seasonYear > 0, seasonYear_2 > 0) %>%
  select(composerName) %>%
  head(20)

Modelli in cui tutte e due le componenti sono negative. Non ci sono.

down_down <- pcp_models_simp %>%
  filter(seasonYear<0,seasonYear_2<0) %>%
  select(composerName) %>%
  head(20)

Modelli in cui la componente quadratica è positiva e la componente lineare è negativa. Compositori con almeno 26 stagioni.

down_up <- pcp_models_simp %>%
  filter(seasonYear<0,seasonYear_2>0) %>%
  select(composerName)

pcp %>%
  anti_join(atleast26) %>%
  semi_join(down_up) %>%
  ggplot(aes(x = seasonYear, y = perc)) +
  geom_point(alpha = 1/4) +
  facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
## Joining with `by = join_by(composerName)`

Modelli in cui la componente quadratica è negativa e in cui la componente lineare è positiva. Compositori con almeno 26 stagioni.

up_down <- pcp_models_simp %>%
  filter(seasonYear>0,seasonYear_2<0) %>%
  select(composerName)

pcp %>%
  anti_join(atleast26) %>%
  semi_join(up_down) %>%
  ggplot(aes(x = seasonYear, y = perc)) +
  geom_point(alpha = 1/4) +
  facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
## Joining with `by = join_by(composerName)`

Residui.

pcp_resids %>%
  ggplot(aes(resid)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot dei residui nel tempo.

ggplot(pcp_resids) +
  geom_point(aes(seasonYear, resid), alpha = 1/4, show.legend = FALSE)

Guardo ai compositori che hanno residui più grandi di 0.1.

pcp_highres <- pcp_resids %>%
  filter(resid > 0.1 | resid < -0.1) %>%
  distinct(composerName)

pcp %>%
  inner_join(pcp_highres) %>%
  ggplot() +
  geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
  facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`

Modelli che fittano abbastanza bene. Compositori con almeno 26 stagioni.

pcp %>%
  anti_join(atleast26) %>%
  semi_join(
    pcp_glance %>%
    filter(glance_r.squared > 0.6) %>%
    select(composerName)
  ) %>%
  ggplot() +
  geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
  facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`
## Joining with `by = join_by(composerName)`

Compositori presenti in almeno 80 stagioni, con densità 70 %. Sono 25 compositori, responsabili per il 57 % delle performance.

over80 <- performances %>%
  group_by(composerName) %>%
  mutate(
    seasons = n_distinct(seasonYear),
    first = min(seasonYear),
    last = max(seasonYear),
    span = last - first + 1,
    density = seasons / span) %>%
  filter(seasons > 80 & density > 0.70) %>%
  distinct(composerName)

performances %>%
  mutate (totali = n()) %>%
  inner_join(over80) %>%
  mutate (over80 = n(), perc = over80 / totali) %>%
  distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
##    perc
##   <dbl>
## 1 0.574

Solo gli over 80.

ggplot(
  pcp %>%
  inner_join(over80)
) +
  geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
  facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`

Simile, più di 40 stagioni, con densità del 70%.

over40 <- performances %>%
  group_by(composerName) %>%
  mutate(
    seasons = n_distinct(seasonYear),
    first = min(seasonYear),
    last = max(seasonYear),
    span = last - first + 1,
    density = seasons / span) %>%
  filter(seasons >= 40 & density > 0.70) %>%
  distinct(composerName)

performances %>%
  mutate (totali = n()) %>%
  inner_join(over40) %>%
  mutate (over40 = n(), perc = over40 / totali) %>%
  distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
##    perc
##   <dbl>
## 1 0.639

Over 40.

ggplot(
  pcp %>%
  inner_join(over40)
) +
  geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
  facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`

Gini compositori.

library(ineq)

gini_comp <- pcp %>%
  select(seasonYear,n) %>%
  group_by(seasonYear) %>%
  mutate(gini = Gini(n)) %>%
  distinct(seasonYear,gini)

ggplot(gini_comp) +
    geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE) +
    geom_line(aes(seasonYear,gini))

Un altro subset, compositori rappresentati prima della stagione 1909, per almeno 8 stagioni. Rappresentano il 48 % delle performance totali.

before_1909 <- performances %>%
  group_by(composerName) %>%
  filter(seasonYear < 1909) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons >= 8) %>%
  distinct(composerName)

saveRDS(before_1909, file = "before_1909")

performances %>%
  mutate (totali = n()) %>%
  inner_join(before_1909) %>%
  mutate (before_1909 = n(), perc = before_1909 / totali) %>%
  distinct(perc)
## Joining with `by = join_by(composerName)`
## # A tibble: 1 × 1
##    perc
##   <dbl>
## 1 0.478

Compositori rappresentati prima della stagione 1909-1910 per almeno 8 stagioni.

ggplot(
  pcp %>%
  inner_join(before_1909)
  ) +
  geom_point(aes(x = seasonYear, y = perc),alpha = 1/4) +
  geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE) +
  facet_wrap(~composerName)
## Joining with `by = join_by(composerName)`

Numero di compositori per stagione.

performances %>%
  group_by(seasonYear) %>%
  mutate(composers = n_distinct(composerName)) %>%
  ungroup() %>%
  ggplot() +
  geom_point(aes(seasonYear,composers)) +
  geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE)

Stagioni che hanno Gini compositori maggiore di 0.5.

pcp %>%
  inner_join(
    gini_comp %>%
  filter(gini > 0.5)
  ) %>%
  ggplot(aes(seasonYear,perc,color=composerName)) +
  geom_point(show.legend = FALSE)
## Joining with `by = join_by(seasonYear)`

Stagioni che hanno Gini compositori maggiore di 0.5. Compositori che nella stagione hanno più dell’10% delle performance.

pcp %>%
  inner_join(
    gini_comp %>%
  filter(gini > 0.5)
  ) %>%
  filter(perc > 0.1) %>%
  ggplot(aes(seasonYear,perc,color=composerName)) +
  geom_point(show.legend = TRUE)
## Joining with `by = join_by(seasonYear)`

Mediana percentuali per stagione.

pcp %>%
  group_by(seasonYear) %>%
  mutate(median = median(perc)) %>%
  ggplot() +
  geom_line(aes(seasonYear,median)) +
  geom_vline(aes(xintercept = 1909, color = "red"), show.legend = FALSE) +
  annotate("text", x = 1917, y = 0.06, label="1909", angle=0)

Si possono individuare nei trend all’interno dell’evoluzione dei repertori dei conduttori? Modello quadratico.

ccn2 <- performances %>%
  filter(!is.na(conductorName)) %>%
  filter(conductorName != "Not conducted") %>%
  group_by(seasonYear,conductorName,composerName) %>%
  mutate(comp_cond_season = n()) %>%
  group_by(seasonYear,conductorName) %>%
  mutate(cond_season = n()) %>%
  mutate(perc = comp_cond_season / cond_season) %>%
  group_by(conductorName,composerName) %>%
  mutate(seasonYear_2 = seasonYear**2) %>%
  ungroup() %>%
  distinct(seasonYear,seasonYear_2,conductorName,composerName,perc)

saveRDS(ccn2, file = "ccn2.RDS")

ccn2_nested <- ccn2 %>%
  group_by(conductorName,composerName) %>%
  nest()

ccn2_func <- function(df) {
  lm(perc ~ seasonYear + seasonYear_2, data = df)
}

regs <- ccn2_nested %>%
  mutate(
    model = map(data, ccn2_func),
    tidied = map(model, tidy),
    glanced = map(model, glance),
    augmented = map(model, augment)
  )
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `tidied = map(model, tidy)`.
## ℹ In group 7377: `conductorName = "Stahlberg, Fritz"`, `composerName =
##   "Stahlberg, Fritz"`.
## Caused by warning in `summary.lm()`:
## ! essentially perfect fit: summary may be unreliable
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 2 remaining warnings.
saveRDS(regs, file = "regs.RDS")

Modelli “molto buoni” e “buoni”.

very_good <- regs %>%
  unnest(glanced) %>%
  filter(r.squared >= 0.9) %>%
select(conductorName, composerName)

good <- regs %>%
  unnest(glanced) %>%
  filter(r.squared >= 0.8) %>%
select(conductorName, composerName)

Come sono distribuiti i residui.

regs %>%
  unnest(augmented) %>%
  select(.resid) %>%
  ggplot() +
    geom_histogram(aes(.resid))
## Adding missing grouping variables: `conductorName`, `composerName`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Seleziono quelli più alti di 0.2

highres <- regs %>%
  unnest(augmented) %>%
  filter(.resid > 0.2 | .resid < -0.2) %>%
  distinct(conductorName,composerName,seasonYear,.resid)

Direttori-compositori con fit buono per coppie che compaiono almeno per 5 stagioni.

ccn2 %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons >= 5) %>%
  inner_join(good) %>%
  ggplot() +
  geom_line(aes(seasonYear,perc,color=conductorName), show.legend = FALSE) +
  geom_point(aes(seasonYear,perc,color=conductorName), show.legend = FALSE) +
  facet_wrap(~conductorName)
## Joining with `by = join_by(conductorName, composerName)`

Bernstein, fit buono. Almeno tre stagioni.

ccn2 %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons > 2) %>%
  filter(conductorName == "Bernstein, Leonard") %>%
  inner_join(good) %>%
  ggplot() +
  geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`

Bernstein non nel fit buono. Almeno tre stagioni

ccn2 %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons > 2) %>%
  filter(conductorName == "Bernstein, Leonard") %>%
  anti_join(good) %>%
  ggplot() +
  geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`

Masur, fit buono. Almeno tre stagioni.

ccn2 %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons > 2) %>%
  filter(conductorName == "Masur, Kurt") %>%
  inner_join(good) %>%
  ggplot() +
  geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`

Masur, non nel fit buono. Almeno tre stagioni.

ccn2 %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons > 2) %>%
  filter(conductorName == "Masur, Kurt") %>%
  anti_join(good) %>%
  ggplot() +
  geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`

Boulez, fit buono. Almeno tre stagioni.

ccn2 %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons > 2) %>%
  filter(conductorName == "Boulez, Pierre") %>%
  inner_join(good) %>%
  ggplot() +
  geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`

Boulez, non nel fit buono. Almeno tre stagioni.

ccn2 %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons > 2) %>%
  filter(conductorName == "Boulez, Pierre") %>%
  anti_join(good) %>%
  ggplot() +
  geom_line(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  geom_point(aes(seasonYear,perc,color=composerName), show.legend = FALSE) +
  facet_wrap(~composerName)
## Joining with `by = join_by(conductorName, composerName)`

remove(ccn2)
remove(regs)

Modello lineare. Per ogni direttore-compositore modello, in base all’anno, la variazione della percentuale delle performance di quel compositore nelle performance totali del direttore nell’anno.

ccn <- performances %>%
  filter(!is.na(conductorName)) %>%
  filter(conductorName != "Not conducted") %>%
  group_by(seasonYear,conductorName,composerName) %>%
  mutate(comp_cond_season = n()) %>%
  group_by(seasonYear,conductorName) %>%
  mutate(cond_season = n()) %>%
  mutate(perc = comp_cond_season / cond_season) %>%
  group_by(conductorName,composerName) %>%
  ungroup() %>%
  distinct(seasonYear,conductorName,composerName,perc)

saveRDS(ccn, file = "ccn.RDS")

ccn_nested <- ccn %>%
  group_by(conductorName,composerName) %>%
  nest()

ccn_func <- function(df) {
  lm(perc ~ seasonYear, data = df)
}

lin_regs <- ccn_nested %>%
  mutate(
    model = map(data, ccn_func),
    tidied = map(model, tidy),
    glanced = map(model, glance),
    augmented = map(model, augment)
  )
## Warning: There were 15 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `tidied = map(model, tidy)`.
## ℹ In group 3417: `conductorName = "Jansons, Mariss"`, `composerName = "Rossini,
##   Gioachino"`.
## Caused by warning in `summary.lm()`:
## ! essentially perfect fit: summary may be unreliable
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 14 remaining warnings.
saveRDS(lin_regs, file = "lin_regs.RDS")

Modelli con R^2 sopra 0.9 e sopra 0.8.

very_good_lin <- lin_regs %>%
  unnest(glanced) %>%
  filter(r.squared >= 0.9) %>%
select(conductorName, composerName)

good_lin <- lin_regs %>%
  unnest(glanced) %>%
  filter(r.squared >= 0.8) %>%
select(conductorName, composerName)

Modelli con R^2 sopra 0.8 (coppie direttori-compositori con almeno cinque stagioni).

ccn %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons >= 5) %>%
  inner_join(good_lin) %>%
  ggplot() +
  geom_line(aes(seasonYear,perc,color=conductorName), show.legend = FALSE) +
  geom_point(aes(seasonYear,perc,color=conductorName), show.legend = FALSE) +
  facet_wrap(~conductorName)
## Joining with `by = join_by(conductorName, composerName)`

Bernstein modelli lineari positivi e negativi.

BernsteinHeadNeg <- lin_regs %>%
  unnest(tidied) %>%
  select(conductorName,composerName,term,estimate) %>%
  pivot_wider(names_from = term,values_from = estimate) %>%
  select(conductorName,composerName,seasonYear) %>%
  filter(!is.na(seasonYear)) %>%
  filter(conductorName == "Bernstein, Leonard") %>%
  filter(seasonYear < 0) %>%
  arrange(seasonYear)

saveRDS(BernsteinHeadNeg, file = "BernsteinHeadNeg.RDS")

BernsteinHeadPos <- lin_regs %>%
  unnest(tidied) %>%
  select(conductorName,composerName,term,estimate) %>%
  pivot_wider(names_from = term,values_from = estimate) %>%
  select(conductorName,composerName,seasonYear) %>%
  filter(!is.na(seasonYear)) %>%
  filter(conductorName == "Bernstein, Leonard") %>%
  filter(seasonYear > 0) %>%
  arrange(-seasonYear)

saveRDS(BernsteinHeadPos, file = "BernsteinHeadPos.RDS")

Bernstein lineari positivi (almeno dieci stagioni).

ccn %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons >= 10) %>%
  inner_join(BernsteinHeadPos, by = c("conductorName","composerName")) %>%
  ggplot() +
  geom_line(aes(seasonYear.x,perc,color=composerName), show.legend = FALSE) +
  geom_point(aes(seasonYear.x,perc,color=composerName), show.legend = FALSE) +
  facet_wrap(~composerName)

Bernstein lineari negativi. Almeno cinque stagioni.

ccn %>%
  group_by(conductorName,composerName) %>%
  mutate(seasons = n_distinct(seasonYear)) %>%
  filter(seasons >= 5) %>%
  inner_join(BernsteinHeadNeg, by = c("conductorName","composerName")) %>%
  ggplot() +
  geom_line(aes(seasonYear.x,perc,color=composerName), show.legend = FALSE) +
  geom_point(aes(seasonYear.x,perc,color=composerName), show.legend = FALSE) +
  facet_wrap(~composerName)

Anthem, legato ai tour.

performances %>%
  filter(composerName == "Anthem,") %>%
  ggplot() +
  geom_point(aes(Date,workTitle,color=eventType), show.legend = FALSE)

Anthem, cresce dal 1957 per i molti tour all’estero e nel 1918 per i tour nazionali.

performances %>%
  filter(composerName == "Anthem,") %>%
  filter(eventType == "Tour") %>%
  ggplot() +
  geom_histogram(aes(seasonYear), binwidth = 1) +
  geom_vline(aes(xintercept = 1957, color = "red"), show.legend = FALSE) +
  geom_vline(aes(xintercept = 1918, color = "red"), show.legend = FALSE)

Tour del 1918 in america, inno americano.

performances %>%
  filter(composerName == "Anthem,") %>%
  filter(eventType == "Tour") %>%
  filter(seasonYear == 1918) %>%
  ggplot() +
  geom_bar(aes(y = Location, fill = workTitle))

Tour del 1957 nel mondo.

performances %>%
  filter(composerName == "Anthem,") %>%
  filter(eventType == "Tour") %>%
  filter(seasonYear == 1957) %>%
  ggplot() +
  geom_bar(aes(y = Location, fill = workTitle))

Richard Wagner fino al 1950. Performance e direttori.

performances %>%
  filter(composerName == "Wagner, Richard") %>%
  filter(seasonYear <= 1950) %>%
  ggplot() +
  geom_jitter(aes(x = seasonYear, y = conductorName),alpha = 1/4)

Richard Wagner fino al 1950, otto maggiori direttori. Numero di performance e periodo di attività.

top_Wagner <- performances %>%
  filter(composerName == "Wagner, Richard") %>%
  filter(seasonYear <= 1950) %>%
  group_by(seasonYear,conductorName) %>%
  count() %>%
  group_by(conductorName) %>%
  mutate(total = sum(n)) %>%
  arrange(-total) %>%
  distinct(conductorName, total) %>%
  head(8) %>%
  mutate(group = conductorName)

colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", 
                       "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

performances %>%
  filter(composerName == "Wagner, Richard") %>%
  filter(seasonYear <= 1950) %>%
  inner_join(top_Wagner) %>%
  ggplot() +
  geom_area(aes(seasonYear,fill=conductorName), stat = "bin", binwidth = 1, show.legend = TRUE) +
  scale_fill_manual(values=colorBlindBlack8) +
  facet_wrap(~conductorName)
## Joining with `by = join_by(conductorName)`

Richard Wagner. Direttore Josef Stransky.

performances %>%
  filter(composerName == "Wagner, Richard") %>%
  filter(!is.na(conductorName)) %>%
  filter(seasonYear <= 1925) %>%
  mutate(
    group = ifelse(conductorName == "Stransky, Josef", "Stransky, Josef", "Altri conduttori")
  ) %>%
  ggplot() +
    geom_bar(aes(seasonYear,fill=group))

Aaron Copland dal 1950 al 1995. Performance e direttori.

performances %>%
  filter(composerName == "Copland, Aaron") %>%
  filter(seasonYear >= 1950 & seasonYear <= 1995) %>%
  ggplot() +
  geom_jitter(aes(x = seasonYear, y = conductorName),alpha = 1/4)

Aaron Copland dal 1950 al 1995, otto maggiori direttori. Numero di performance e periodo di attività.

top_Copland <- performances %>%
  filter(composerName == "Copland, Aaron") %>%
  filter(seasonYear >= 1950 & seasonYear <= 1995) %>%
  group_by(seasonYear,conductorName) %>%
  count() %>%
  group_by(conductorName) %>%
  mutate(total = sum(n)) %>%
  arrange(-total) %>%
  distinct(conductorName, total) %>%
  head(8) %>%
  mutate(group = conductorName)

performances %>%
  filter(composerName == "Copland, Aaron") %>%
  filter(seasonYear >= 1950 & seasonYear <= 1995) %>%
  inner_join(top_Copland) %>%
  ggplot() +
  geom_area(aes(seasonYear,fill=conductorName), stat = "bin", binwidth = 1, show.legend = TRUE) +
  scale_fill_manual(values=colorBlindBlack8) +
  facet_wrap(~conductorName)
## Joining with `by = join_by(conductorName)`